{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# nvCOMP Python API Basics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import cupy as cp"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Download example files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('mobydick.txt', <http.client.HTTPMessage at 0x7f9022b42ef0>)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import urllib.request\n",
    "urllib.request.urlretrieve(\"http://textfiles.com/etext/NONFICTION/locke-essay-113.txt\", \"locke-essay-113.txt\")\n",
    "urllib.request.urlretrieve(\"http://textfiles.com/etext/FICTION/mobydick.txt\", \"mobydick.txt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import nvCOMP Python module and check versions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "nvcomp version: 4.0.0\n",
      "nvcomp cuda version: 12030\n"
     ]
    }
   ],
   "source": [
    "from nvidia import nvcomp\n",
    "print(\"nvcomp version:\", nvcomp.__version__)\n",
    "print(\"nvcomp cuda version:\", nvcomp.__cuda_version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Zero-copy import host array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "ascending = np.arange(0, 4096, dtype=np.int32)\n",
    "nvarr_h = nvcomp.as_array(ascending)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'data': (94335900832880, False), 'strides': None, 'descr': [('', '<i4')], 'typestr': '<i4', 'shape': (4096,), 'version': 3}\n",
      "{'shape': (4096,), 'strides': None, 'typestr': '<i4', 'data': (94335900832880, False), 'version': 3}\n",
      "{'shape': (4096,), 'strides': None, 'typestr': '<i4', 'data': (94335900832880, False), 'version': 3, 'stream': 1}\n",
      "16384\n",
      "ArrayBufferKind.STRIDED_HOST\n",
      "1\n",
      "int32\n",
      "(4096,)\n",
      "(4,)\n",
      "4\n",
      "4096\n"
     ]
    }
   ],
   "source": [
    "print(ascending.__array_interface__)\n",
    "print(nvarr_h.__array_interface__)\n",
    "print(nvarr_h.__cuda_array_interface__)\n",
    "print(nvarr_h.buffer_size)\n",
    "print(nvarr_h.buffer_kind)\n",
    "print(nvarr_h.ndim)\n",
    "print(nvarr_h.dtype)\n",
    "print(nvarr_h.shape)\n",
    "print(nvarr_h.strides)\n",
    "print(nvarr_h.item_size)\n",
    "print(nvarr_h.size)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Zero-copy import device array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'shape': (4096,), 'typestr': '<i4', 'descr': [('', '<i4')], 'stream': 1, 'version': 3, 'strides': None, 'data': (34472984576, False)}\n",
      "{'shape': (4096,), 'strides': None, 'typestr': '<i4', 'data': (34472984576, False), 'version': 3, 'stream': 1}\n",
      "ArrayBufferKind.STRIDED_DEVICE\n",
      "1\n",
      "int32\n",
      "(4096,)\n",
      "(4,)\n",
      "4\n",
      "4096\n"
     ]
    }
   ],
   "source": [
    "data_gpu = cp.array(ascending)\n",
    "nvarr_d = nvcomp.as_array(data_gpu)\n",
    "print(data_gpu.__cuda_array_interface__)\n",
    "print(nvarr_d.__cuda_array_interface__)\n",
    "print(nvarr_d.buffer_kind)\n",
    "print(nvarr_d.ndim)\n",
    "print(nvarr_d.dtype)\n",
    "print(nvarr_d.shape)\n",
    "print(nvarr_d.strides)\n",
    "print(nvarr_d.item_size)\n",
    "print(nvarr_d.size)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert host array to device array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'shape': (4096,), 'strides': None, 'typestr': '<i4', 'data': (47244640256, False), 'version': 3, 'stream': 1}\n"
     ]
    }
   ],
   "source": [
    "nvarr_d_cnv = nvarr_h.cuda()\n",
    "print(nvarr_d_cnv.__cuda_array_interface__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert device array to host array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'shape': (4096,), 'strides': None, 'typestr': '<i4', 'data': (12960415744, False), 'version': 3}\n"
     ]
    }
   ],
   "source": [
    "nvarr_h_cnv = nvarr_d.cpu()\n",
    "print(nvarr_h_cnv.__array_interface__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Encode single array"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read text file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the whole file into memory as a bytes object\n",
    "with open('mobydick.txt', \"rb\") as f:\n",
    "    text = f.read()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`as_array` supports the Python buffer protocol, so we can pass `text` directly to it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'shape': (1205404,), 'strides': None, 'typestr': '|u1', 'data': (94335916846944, False), 'version': 3}\n"
     ]
    }
   ],
   "source": [
    "nvarr_txt_h = nvcomp.as_array(text)\n",
    "print (nvarr_txt_h.__array_interface__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Transfer to Device"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'shape': (1205404,), 'strides': None, 'typestr': '|u1', 'data': (47244656640, False), 'version': 3, 'stream': 1}\n"
     ]
    }
   ],
   "source": [
    "nvarr_txt_d = nvarr_txt_h.cuda()\n",
    "print(nvarr_txt_d.__cuda_array_interface__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create Codec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lz4_codec = nvcomp.Codec(algorithm=\"LZ4\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Encode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "lz4_comp_arr = lz4_codec.encode(nvarr_txt_d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'shape': (824829,), 'strides': None, 'typestr': '|u1', 'data': (47248921600, False), 'version': 3, 'stream': 94335914071776}\n",
      "ArrayBufferKind.STRIDED_DEVICE\n"
     ]
    }
   ],
   "source": [
    "print(lz4_comp_arr.__cuda_array_interface__)\n",
    "print(lz4_comp_arr.buffer_kind)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The array supports the Python buffer protocol, so we can pass it directly to the `write` function."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy the compressed array to the host and write it to disk\n",
    "with open('mobydick.lz4', \"wb\") as f:\n",
    "    f.write(lz4_comp_arr.cpu())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Objects that expose standard interfaces (here, the Python buffer protocol) can be passed directly to the `encode` function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "lz4_comp_arr = lz4_codec.encode(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Decode single array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "nv_dec_array = lz4_codec.decode(lz4_comp_arr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'shape': (1205404,), 'strides': None, 'typestr': '|i1', 'data': (47253028864, False), 'version': 3, 'stream': 94335914071776}\n",
      "ArrayBufferKind.STRIDED_DEVICE\n"
     ]
    }
   ],
   "source": [
    "print(nv_dec_array.__cuda_array_interface__)\n",
    "print(nv_dec_array.buffer_kind)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compare the decoded array with the original and print the first 400 bytes of the decoded array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Is decoded equal to original? True\n",
      "Preliminary Matter.  \n",
      "\n",
      "This text of Melville's Moby-Dick is based on the Hendricks House edition.\n",
      "It was prepared by Professor Eugene F. Irey at the University of Colorado.\n",
      "Any subsequent copies of this data must include this notice  \n",
      "and any publications resulting from analysis of this data must\n",
      "include reference to Professor Irey's work.\n",
      "\n",
      "Etymology  (Supplied by a late consumptive usher to a gra\n"
     ]
    }
   ],
   "source": [
    "# Copy the decoded data to the host once and reuse it for both the comparison and the preview\n",
    "nv_dec_bytes = bytes(nv_dec_array.cpu())\n",
    "print(\"Is decoded equal to original?\", nv_dec_bytes == bytes(nvarr_txt_h))\n",
    "print(nv_dec_bytes[:400].decode())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Encode and decode with the ANS codec, specifying a chunk size and a checksum policy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "ans_codec = nvcomp.Codec(algorithm=\"ANS\", chunk_size=20, checksum_policy = nvcomp.ChecksumPolicy.COMPUTE_AND_VERIFY)\n",
    "ans_comp_arr = ans_codec.encode(nvarr_d)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define decode output type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "uint8\n",
      "uint32\n"
     ]
    }
   ],
   "source": [
    "ans_deco_arr_uint8 = ans_codec.decode(ans_comp_arr)\n",
    "ans_deco_arr_uint32 = ans_codec.decode(ans_comp_arr, '<u4')\n",
    "\n",
    "print(ans_deco_arr_uint8.dtype)\n",
    "print(ans_deco_arr_uint32.dtype)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Codec specific options"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# algorithm_type 0: high-throughput, low compression ratio (default)\n",
    "gdeflate_ht_codec = nvcomp.Codec(algorithm=\"GDeflate\", algorithm_type=0)\n",
    "# algorithm_type 1: low-throughput, high compression ratio\n",
    "gdeflate_lt_codec = nvcomp.Codec(algorithm=\"GDeflate\", algorithm_type=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "300 µs ± 50.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "gdeflate_ht_comp_arr = gdeflate_ht_codec.encode(text[:4096])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "895 µs ± 24.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "gdeflate_lt_comp_arr = gdeflate_lt_codec.encode(text[:4096])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "high-throughput, low compression ratio (default) - compressed size: 2664\n",
      "low-throughput, high compression ratio - compressed size: 2520\n"
     ]
    }
   ],
   "source": [
    "gdeflate_ht_comp_arr = gdeflate_ht_codec.encode(text[:4096])\n",
    "gdeflate_lt_comp_arr = gdeflate_lt_codec.encode(text[:4096])\n",
    "print(\"high-throughput, low compression ratio (default) - compressed size:\", gdeflate_ht_comp_arr.size)\n",
    "print(\"low-throughput, high compression ratio - compressed size:\", gdeflate_lt_comp_arr.size)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Encode single array with multiple codecs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Uncompressed size is 1205404\n",
      "Compressed size for LZ4 is 824829 (68.4%)\n",
      "Compressed size for Snappy is 863151 (71.6%)\n",
      "Compressed size for GDeflate is 622632 (51.7%)\n",
      "Compressed size for Deflate is 619051 (51.4%)\n",
      "Compressed size for Bitcomp is 986776 (81.9%)\n",
      "Compressed size for ANS is 737804 (61.2%)\n",
      "Compressed size for Zstd is 540745 (44.9%)\n",
      "Compressed size for Cascaded is 1205948 (100.0%)\n"
     ]
    }
   ],
   "source": [
    "print(\"Uncompressed size is\", nvarr_txt_d.buffer_size)\n",
    "algos = [\"LZ4\", \"Snappy\", \"GDeflate\", \"Deflate\", \"Bitcomp\", \"ANS\", \"Zstd\", \"Cascaded\"]\n",
    "encoded_files = []\n",
    "for algorithm in algos:\n",
    "    codec = nvcomp.Codec(algorithm=algorithm)\n",
    "    comp_arr = codec.encode(nvarr_txt_d)\n",
    "    comp_ratio = comp_arr.buffer_size / nvarr_txt_d.buffer_size\n",
    "    print(\"Compressed size for\", algorithm, \"is\", comp_arr.buffer_size, \"({:.1%})\".format(comp_ratio))\n",
    "    # Write each compressed stream to its own file; the names are collected in encoded_files\n",
    "    file_name = \"mobydick.%s\" % algorithm\n",
    "    with open(file_name, \"wb\") as f:\n",
    "        f.write(comp_arr.cpu())\n",
    "    encoded_files.append(file_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Decoding single arrays of various formats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Decoding mobydick.LZ4\n",
      "is equal to original? - True\n",
      "Decoding mobydick.Snappy\n",
      "is equal to original? - True\n",
      "Decoding mobydick.GDeflate\n",
      "is equal to original? - True\n",
      "Decoding mobydick.Deflate\n",
      "is equal to original? - True\n",
      "Decoding mobydick.Bitcomp\n",
      "is equal to original? - True\n",
      "Decoding mobydick.ANS\n",
      "is equal to original? - True\n",
      "Decoding mobydick.Zstd\n",
      "is equal to original? - True\n",
      "Decoding mobydick.Cascaded\n",
      "is equal to original? - True\n"
     ]
    }
   ],
   "source": [
    "codec = nvcomp.Codec()\n",
    "original_bytes = bytes(nvarr_txt_h)  # reference data is loop-invariant, so convert it once\n",
    "for file_name in encoded_files:\n",
    "    print(\"Decoding\", file_name)\n",
    "    with open(file_name, \"rb\") as f:\n",
    "        comp_bytes = f.read()\n",
    "    # comp_bytes supports the buffer protocol, so it can be passed to decode directly\n",
    "    nv_dec_d = codec.decode(comp_bytes)\n",
    "    print(\"is equal to original? -\", bytes(nv_dec_d.cpu()) == original_bytes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Encoding and decoding with various Bitstream Kinds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Uncompressed size is 1205404\n",
      "Compressed size for LZ4 with bitstream BitstreamKind.NVCOMP_NATIVE is 824829 (68.4%)\n",
      "is equal to original? - True\n",
      "Compressed size for LZ4 with bitstream BitstreamKind.RAW is 807075 (67.0%)\n",
      "is equal to original? - True\n",
      "Compressed size for LZ4 with bitstream BitstreamKind.WITH_UNCOMPRESSED_SIZE is 807079 (67.0%)\n",
      "is equal to original? - True\n",
      "Compressed size for Snappy with bitstream BitstreamKind.NVCOMP_NATIVE is 863143 (71.6%)\n",
      "is equal to original? - True\n",
      "Compressed size for Snappy with bitstream BitstreamKind.RAW is 854105 (70.9%)\n",
      "is equal to original? - True\n",
      "Compressed size for Snappy with bitstream BitstreamKind.WITH_UNCOMPRESSED_SIZE is 854113 (70.9%)\n",
      "is equal to original? - True\n",
      "Compressed size for Bitcomp with bitstream BitstreamKind.NVCOMP_NATIVE is 986776 (81.9%)\n",
      "is equal to original? - True\n",
      "Compressed size for Bitcomp with bitstream BitstreamKind.RAW is 985800 (81.8%)\n",
      "is equal to original? - True\n",
      "Compressed size for Bitcomp with bitstream BitstreamKind.WITH_UNCOMPRESSED_SIZE is 985808 (81.8%)\n",
      "is equal to original? - True\n",
      "Compressed size for ANS with bitstream BitstreamKind.NVCOMP_NATIVE is 737802 (61.2%)\n",
      "is equal to original? - True\n",
      "Compressed size for ANS with bitstream BitstreamKind.RAW is 680096 (56.4%)\n",
      "is equal to original? - True\n",
      "Compressed size for ANS with bitstream BitstreamKind.WITH_UNCOMPRESSED_SIZE is 680104 (56.4%)\n",
      "is equal to original? - True\n",
      "Compressed size for Zstd with bitstream BitstreamKind.NVCOMP_NATIVE is 540745 (44.9%)\n",
      "is equal to original? - True\n",
      "Compressed size for Zstd with bitstream BitstreamKind.RAW is 527380 (43.8%)\n",
      "is equal to original? - True\n",
      "Compressed size for Zstd with bitstream BitstreamKind.WITH_UNCOMPRESSED_SIZE is 527388 (43.8%)\n",
      "is equal to original? - True\n",
      "Compressed size for Cascaded with bitstream BitstreamKind.NVCOMP_NATIVE is 1205948 (100.0%)\n",
      "is equal to original? - True\n",
      "Compressed size for Cascaded with bitstream BitstreamKind.RAW is 1205412 (100.0%)\n",
      "is equal to original? - True\n",
      "Compressed size for Cascaded with bitstream BitstreamKind.WITH_UNCOMPRESSED_SIZE is 1205420 (100.0%)\n",
      "is equal to original? - True\n"
     ]
    }
   ],
   "source": [
    "print(\"Uncompressed size is\", nvarr_txt_d.buffer_size)\n",
    "algos = [\"LZ4\", \"Snappy\", \"Bitcomp\", \"ANS\", \"Zstd\", \"Cascaded\"]\n",
    "bitstreams = [\n",
    "    nvcomp.BitstreamKind.NVCOMP_NATIVE,\n",
    "    nvcomp.BitstreamKind.RAW,\n",
    "    nvcomp.BitstreamKind.WITH_UNCOMPRESSED_SIZE,\n",
    "]\n",
    "# Copy the reference data to the host once instead of on every iteration\n",
    "original_bytes = bytes(nvarr_txt_d.cpu())\n",
    "\n",
    "for algorithm in algos:\n",
    "    for bitstream_kind in bitstreams:\n",
    "        codec = nvcomp.Codec(algorithm=algorithm, bitstream_kind=bitstream_kind)\n",
    "        comp_arr = codec.encode(nvarr_txt_d)\n",
    "        comp_ratio = comp_arr.buffer_size / nvarr_txt_d.buffer_size\n",
    "        print(\"Compressed size for\", algorithm, \"with bitstream\", bitstream_kind, \"is\", comp_arr.buffer_size, \"({:.1%})\".format(comp_ratio))\n",
    "        decomp_array = codec.decode(comp_arr)\n",
    "        print(\"is equal to original? -\", bytes(decomp_array.cpu()) == original_bytes)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Batch encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "nv_uncomp_arrays = []\n",
    "for fn in ['mobydick.txt', 'locke-essay-113.txt']:\n",
    "    # Use a dedicated name so `text` (the Moby-Dick bytes used by earlier cells) is not overwritten\n",
    "    with open(fn, \"rb\") as f:\n",
    "        file_bytes = f.read()\n",
    "    nv_uncomp_arrays.append(nvcomp.as_array(file_bytes).cuda())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encoding a list of arrays returns a batch; keep it separate from the single-array `lz4_comp_arr`\n",
    "nv_comp_arrays = lz4_codec.encode(nv_uncomp_arrays)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'shape': (1205404,), 'strides': None, 'typestr': '|u1', 'data': (47647160320, False), 'version': 3, 'stream': 1}\n",
      "{'shape': (824829,), 'strides': None, 'typestr': '|u1', 'data': (47625135104, False), 'version': 3, 'stream': 94335914071776}\n",
      "{'shape': (1605768,), 'strides': None, 'typestr': '|u1', 'data': (47648366080, False), 'version': 3, 'stream': 1}\n",
      "{'shape': (978812,), 'strides': None, 'typestr': '|u1', 'data': (47247113728, False), 'version': 3, 'stream': 94335914071776}\n"
     ]
    }
   ],
   "source": [
    "for i in range(len(nv_uncomp_arrays)):\n",
    "    print(nv_uncomp_arrays[i].__cuda_array_interface__)\n",
    "    print(nv_comp_arrays[i].__cuda_array_interface__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Batch decoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "nv_dec_arrays = lz4_codec.decode(nv_comp_arrays)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Compare with original"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Is decoded equal to original? True\n",
      "\n",
      " Preliminary Matter.  \n",
      "\n",
      "This text of Melville's Moby-Dick is based on the Hendricks House edition.\n",
      "It was prepared by Professor Eugene F. Irey at the University of Colorado.\n",
      "Any subsequent copies of this data must include this notice  \n",
      "and any publications resulting from analysis of this data must\n",
      "include reference to Professor Irey's work.\n",
      "\n",
      "Etymology  (Supplied by a late consumptive usher to a gra\n",
      "Is decoded equal to original? True\n",
      "\n",
      "                                       1690\n",
      "\n",
      "                    AN ESSAY CONCERNING HUMAN UNDERSTANDING\n",
      "\n",
      "                                 by John Locke\n",
      "\n",
      "                       TO THE RIGHT HONOURABLE\n",
      "\n",
      "            LORD THOMAS, EARL OF PEMBROKE AND MONTGOMERY,\n",
      "\n",
      "                      BARRON HERBERT OF CARDIFF,\n",
      "\n",
      "      LORD ROSS, OF KENDAL, PAR, FITZHUGH, MARMION, ST. QUINTIN,\n",
      "\n",
      "          AND SHURLAND; \n"
     ]
    }
   ],
   "source": [
    "for i in range(len(nv_dec_arrays)):\n",
    "    # One device-to-host copy per decoded array, reused for the comparison and the preview\n",
    "    dec_bytes = bytes(nv_dec_arrays[i].cpu())\n",
    "    print(\"Is decoded equal to original?\", bytes(nv_uncomp_arrays[i].cpu()) == dec_bytes)\n",
    "    print(\"\\n\", dec_bytes[:400].decode())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Batch decoding and encoding various formats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Algorithm: LZ4 BitstreamKind: BitstreamKind.NVCOMP_NATIVE\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 824829 (68.4%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 978812 (61.0%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: LZ4 BitstreamKind: BitstreamKind.RAW\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 807075 (67.0%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 964181 (60.0%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: LZ4 BitstreamKind: BitstreamKind.WITH_UNCOMPRESSED_SIZE\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 807079 (67.0%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 964185 (60.0%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: Snappy BitstreamKind: BitstreamKind.NVCOMP_NATIVE\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 863151 (71.6%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 960232 (59.8%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: Snappy BitstreamKind: BitstreamKind.RAW\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 854105 (70.9%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 945120 (58.9%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: Snappy BitstreamKind: BitstreamKind.WITH_UNCOMPRESSED_SIZE\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 854113 (70.9%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 945128 (58.9%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: Bitcomp BitstreamKind: BitstreamKind.NVCOMP_NATIVE\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 986776 (81.9%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 1300340 (81.0%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: Bitcomp BitstreamKind: BitstreamKind.RAW\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 985800 (81.8%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 1299060 (80.9%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: Bitcomp BitstreamKind: BitstreamKind.WITH_UNCOMPRESSED_SIZE\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 985808 (81.8%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 1299068 (80.9%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: Cascaded BitstreamKind: BitstreamKind.NVCOMP_NATIVE\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 1205948 (100.0%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 1606456 (100.0%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: Cascaded BitstreamKind: BitstreamKind.RAW\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 1205412 (100.0%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 1605776 (100.0%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: Cascaded BitstreamKind: BitstreamKind.WITH_UNCOMPRESSED_SIZE\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 1205420 (100.0%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 1605784 (100.0%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: Zstd BitstreamKind: BitstreamKind.NVCOMP_NATIVE\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 540745 (44.9%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 613497 (38.2%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: Zstd BitstreamKind: BitstreamKind.RAW\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 527380 (43.8%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 596140 (37.1%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: Zstd BitstreamKind: BitstreamKind.WITH_UNCOMPRESSED_SIZE\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 527388 (43.8%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 596148 (37.1%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: ANS BitstreamKind: BitstreamKind.NVCOMP_NATIVE\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 737804 (61.2%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 957184 (59.6%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: ANS BitstreamKind: BitstreamKind.RAW\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 680094 (56.4%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 879728 (54.8%)\n",
      "   -- Is decoded equal to original? True\n",
      "Algorithm: ANS BitstreamKind: BitstreamKind.WITH_UNCOMPRESSED_SIZE\n",
      " - File # 0\n",
      "   -- Uncompressed size: 1205404\n",
      "   -- Compressed size: 680102 (56.4%)\n",
      "   -- Is decoded equal to original? True\n",
      " - File # 1\n",
      "   -- Uncompressed size: 1605768\n",
      "   -- Compressed size: 879736 (54.8%)\n",
      "   -- Is decoded equal to original? True\n"
     ]
    }
   ],
   "source": [
    "algos = [\"LZ4\", \"Snappy\", \"Bitcomp\", \"Cascaded\", \"Zstd\", \"ANS\"]\n",
    "bitstreams = [\n",
    "    nvcomp.BitstreamKind.NVCOMP_NATIVE,\n",
    "    nvcomp.BitstreamKind.RAW,\n",
    "    nvcomp.BitstreamKind.WITH_UNCOMPRESSED_SIZE,\n",
    "]\n",
    "# Host copies of the inputs do not change between iterations, so make them once up front\n",
    "uncomp_bytes = [bytes(arr.cpu()) for arr in nv_uncomp_arrays]\n",
    "\n",
    "for algorithm in algos:\n",
    "    for bitstream_kind in bitstreams:\n",
    "        print(\"Algorithm:\", algorithm, \"BitstreamKind:\", bitstream_kind)\n",
    "        codec = nvcomp.Codec(algorithm=algorithm, bitstream_kind=bitstream_kind)\n",
    "        nv_comp_arrays = codec.encode(nv_uncomp_arrays)\n",
    "        nv_dec_arrays = codec.decode(nv_comp_arrays)\n",
    "        for i in range(len(nv_dec_arrays)):\n",
    "            uncomp_size = nv_uncomp_arrays[i].buffer_size\n",
    "            comp_size = nv_comp_arrays[i].buffer_size\n",
    "            print(\" - File #\", i)\n",
    "            print(\"   -- Uncompressed size:\", uncomp_size)\n",
    "            print(\"   -- Compressed size:\", comp_size, \"({:.1%})\".format(comp_size / uncomp_size))\n",
    "            print(\"   -- Is decoded equal to original?\", uncomp_bytes[i] == bytes(nv_dec_arrays[i].cpu()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Checksum example\n",
    "\n",
    "Checksums can be used only with the `nvcomp.BitstreamKind.NVCOMP_NATIVE` bitstream kind.\n",
    "They are computed before the data is compressed and after it is decompressed,\n",
    "and validate that the decompressed data is equal to the original.\n",
    "\n",
    "Checksums can be used to prevent silent corruption, which can happen when data is corrupted but the decoding finishes without errors,\n",
    "or when there is a bug in the encoding or decoding implementation."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following example shows how to use them.\n",
    "Create a codec which computes and verifies checksums:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "codec = nvcomp.Codec(\n",
    "    algorithm=\"GDeflate\",\n",
    "    bitstream_kind=nvcomp.BitstreamKind.NVCOMP_NATIVE,\n",
    "    checksum_policy=nvcomp.ChecksumPolicy.COMPUTE_AND_VERIFY,\n",
    ")\n",
    "nv_comp_arrays = codec.encode(nv_uncomp_arrays)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Verify that there are no errors when data is correct:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Is array 0 equal to original? - True\n",
      "Is array 1 equal to original? - True\n"
     ]
    }
   ],
   "source": [
    "nv_dec_arrays = codec.decode(nv_comp_arrays)\n",
    "for i in range(len(nv_dec_arrays)):\n",
    "    print (f\"Is array {i} equal to original? -\", bytes(nv_dec_arrays[i].cpu()) ==  bytes(nv_uncomp_arrays[i].cpu()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Introduce an artificial error in the data and decode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "array_with_error = 1\n",
    "\n",
    "cupy_array = cp.asarray(nv_comp_arrays[array_with_error])\n",
    "cupy_array[1000] = cupy_array[1000] ^ 176\n",
    "\n",
    "nv_comp_arrays[array_with_error] = nvcomp.as_array(cupy_array)\n",
    "\n",
    "nv_dec_arrays = codec.decode(nv_comp_arrays)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check which arrays were affected in decoding (should be only one).\n",
    "Checksums are only validated during the first access to the data; any following access will skip that check."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Is array 0 equal to original? - True\n",
      "error with decoding array 1: Checksum doesn't match.\n",
      "Rerunning: Is array 1 equal to original? - False\n"
     ]
    }
   ],
   "source": [
    "for i in range(len(nv_dec_arrays)):\n",
    "    try:\n",
    "        print (f\"Is array {i} equal to original? -\", bytes(nv_dec_arrays[i].cpu()) ==  bytes(nv_uncomp_arrays[i].cpu()))\n",
    "    except RuntimeError as err:\n",
    "        print(f\"error with decoding array {i}: {err}\")\n",
    "        print(f\"Rerunning: Is array {i} equal to original? -\", bytes(nv_dec_arrays[i].cpu()) ==  bytes(nv_uncomp_arrays[i].cpu()))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.undefined"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
